library(tidyverse)
library(janitor)
Attaching package: ‘janitor’
The following objects are masked from ‘package:stats’:
chisq.test, fisher.test
library(here)
here() starts at /Users/richardmichaelclark/games_data_sales_project/games_data_sales_project
here()
[1] "/Users/richardmichaelclark/games_data_sales_project/games_data_sales_project"
sales_2016 <- read_csv("raw_data/sales-2016-with-ratings.csv")
Parsed with column specification:
cols(
Name = [31mcol_character()[39m,
Platform = [31mcol_character()[39m,
Year_of_Release = [31mcol_character()[39m,
Genre = [31mcol_character()[39m,
Publisher = [31mcol_character()[39m,
NA_Sales = [32mcol_double()[39m,
EU_Sales = [32mcol_double()[39m,
JP_Sales = [32mcol_double()[39m,
Other_Sales = [32mcol_double()[39m,
Global_Sales = [32mcol_double()[39m,
Critic_Score = [32mcol_double()[39m,
Critic_Count = [32mcol_double()[39m,
User_Score = [31mcol_character()[39m,
User_Count = [32mcol_double()[39m,
Developer = [31mcol_character()[39m,
Rating = [31mcol_character()[39m
)
sales_2016
names(sales_2016)
[1] "Name" "Platform" "Year_of_Release" "Genre" "Publisher"
[6] "NA_Sales" "EU_Sales" "JP_Sales" "Other_Sales" "Global_Sales"
[11] "Critic_Score" "Critic_Count" "User_Score" "User_Count" "Developer"
[16] "Rating"
I wanted to check all the variable names. They are in capitals so I want to lose the capitals so I will use the clean_names function from janitor package:
sales_clean_2016 <- clean_names(sales_2016)
glimpse(sales_clean_2016)
Rows: 16,719
Columns: 16
$ name [3m[38;5;246m<chr>[39m[23m "Wii Sports", "Super Mario Bros.", "Mario Kart Wii", "Wii Sports Resort", …
$ platform [3m[38;5;246m<chr>[39m[23m "Wii", "NES", "Wii", "Wii", "GB", "GB", "DS", "Wii", "Wii", "NES", "DS", "…
$ year_of_release [3m[38;5;246m<chr>[39m[23m "2006", "1985", "2008", "2009", "1996", "1989", "2006", "2006", "2009", "1…
$ genre [3m[38;5;246m<chr>[39m[23m "Sports", "Platform", "Racing", "Sports", "Role-Playing", "Puzzle", "Platf…
$ publisher [3m[38;5;246m<chr>[39m[23m "Nintendo", "Nintendo", "Nintendo", "Nintendo", "Nintendo", "Nintendo", "N…
$ na_sales [3m[38;5;246m<dbl>[39m[23m 41.36, 29.08, 15.68, 15.61, 11.27, 23.20, 11.28, 13.96, 14.44, 26.93, 9.05…
$ eu_sales [3m[38;5;246m<dbl>[39m[23m 28.96, 3.58, 12.76, 10.93, 8.89, 2.26, 9.14, 9.18, 6.94, 0.63, 10.95, 7.47…
$ jp_sales [3m[38;5;246m<dbl>[39m[23m 3.77, 6.81, 3.79, 3.28, 10.22, 4.22, 6.50, 2.93, 4.70, 0.28, 1.93, 4.13, 7…
$ other_sales [3m[38;5;246m<dbl>[39m[23m 8.45, 0.77, 3.29, 2.95, 1.00, 0.58, 2.88, 2.84, 2.24, 0.47, 2.74, 1.90, 0.…
$ global_sales [3m[38;5;246m<dbl>[39m[23m 82.53, 40.24, 35.52, 32.77, 31.37, 30.26, 29.80, 28.92, 28.32, 28.31, 24.6…
$ critic_score [3m[38;5;246m<dbl>[39m[23m 76, NA, 82, 80, NA, NA, 89, 58, 87, NA, NA, 91, NA, 80, 61, 80, 97, 95, NA…
$ critic_count [3m[38;5;246m<dbl>[39m[23m 51, NA, 73, 73, NA, NA, 65, 41, 80, NA, NA, 64, NA, 63, 45, 33, 50, 80, NA…
$ user_score [3m[38;5;246m<chr>[39m[23m "8", NA, "8.3", "8", NA, NA, "8.5", "6.6", "8.4", NA, NA, "8.6", NA, "7.7"…
$ user_count [3m[38;5;246m<dbl>[39m[23m 322, NA, 709, 192, NA, NA, 431, 129, 594, NA, NA, 464, NA, 146, 106, 52, 3…
$ developer [3m[38;5;246m<chr>[39m[23m "Nintendo", NA, "Nintendo", "Nintendo", NA, NA, "Nintendo", "Nintendo", "N…
$ rating [3m[38;5;246m<chr>[39m[23m "E", NA, "E", "E", NA, NA, "E", "E", "E", NA, NA, "E", NA, "E", "E", "E", …
O want to check for missing variables and n/a so I do this through summary function.
summary(sales_clean_2016)
name platform year_of_release genre publisher
Length:16719 Length:16719 Length:16719 Length:16719 Length:16719
Class :character Class :character Class :character Class :character Class :character
Mode :character Mode :character Mode :character Mode :character Mode :character
na_sales eu_sales jp_sales other_sales global_sales
Min. : 0.0000 Min. : 0.000 Min. : 0.0000 Min. : 0.00000 Min. : 0.0100
1st Qu.: 0.0000 1st Qu.: 0.000 1st Qu.: 0.0000 1st Qu.: 0.00000 1st Qu.: 0.0600
Median : 0.0800 Median : 0.020 Median : 0.0000 Median : 0.01000 Median : 0.1700
Mean : 0.2633 Mean : 0.145 Mean : 0.0776 Mean : 0.04733 Mean : 0.5335
3rd Qu.: 0.2400 3rd Qu.: 0.110 3rd Qu.: 0.0400 3rd Qu.: 0.03000 3rd Qu.: 0.4700
Max. :41.3600 Max. :28.960 Max. :10.2200 Max. :10.57000 Max. :82.5300
critic_score critic_count user_score user_count developer
Min. :13.00 Min. : 3.00 Length:16719 Min. : 4.0 Length:16719
1st Qu.:60.00 1st Qu.: 12.00 Class :character 1st Qu.: 10.0 Class :character
Median :71.00 Median : 21.00 Mode :character Median : 24.0 Mode :character
Mean :68.97 Mean : 26.36 Mean : 162.2
3rd Qu.:79.00 3rd Qu.: 36.00 3rd Qu.: 81.0
Max. :98.00 Max. :113.00 Max. :10665.0
NA's :8582 NA's :8582 NA's :9129
rating
Length:16719
Class :character
Mode :character
Things I note from the summary:
year_of_release is character class which is odd. Is it a relevant variable to answer the questions? I think for plotting time series that it could be so I would want to change the year to an integer.
na_sales is North American sales eu_sales is European jp_sales is Japan other_sales is rest of world global_sales is the sum of all the others
sales_clean_2016 %>%
ggplot(aes(x = name, y = global_sales)) +
geom_point()
So it would appear that there is one outlier that done much better than the rest. Let’s try and find out who that is:
sales_clean_2016 %>%
arrange(desc(global_sales))
boxplot(sales_clean_2016$global_sales)
sales_2019 <- read_csv("raw_data/sales-2019.csv")
Parsed with column specification:
cols(
.default = col_double(),
Name = [31mcol_character()[39m,
basename = [31mcol_character()[39m,
Genre = [31mcol_character()[39m,
ESRB_Rating = [31mcol_character()[39m,
Platform = [31mcol_character()[39m,
Publisher = [31mcol_character()[39m,
Developer = [31mcol_character()[39m,
VGChartz_Score = [33mcol_logical()[39m,
Last_Update = [31mcol_character()[39m,
url = [31mcol_character()[39m,
img_url = [31mcol_character()[39m
)
See spec(...) for full column specifications.
sales_clean_2019 <- clean_names(sales_2019)
sales_clean_2019 %>%
ggplot(aes(x = name, y = global_sales)) +
geom_point()
sales_clean_2019 %>%
summary()
rank name basename genre esrb_rating
Min. : 1 Length:55792 Length:55792 Length:55792 Length:55792
1st Qu.:13949 Class :character Class :character Class :character Class :character
Median :27896 Mode :character Mode :character Mode :character Mode :character
Mean :27896
3rd Qu.:41844
Max. :55792
platform publisher developer vg_chartz_score critic_score
Length:55792 Length:55792 Length:55792 Mode:logical Min. : 1.00
Class :character Class :character Class :character NA's:55792 1st Qu.: 6.40
Mode :character Mode :character Mode :character Median : 7.50
Mean : 7.21
3rd Qu.: 8.30
Max. :10.00
NA's :49256
user_score total_shipped global_sales na_sales pal_sales jp_sales
Min. : 2.00 Min. : 0.03 Min. : 0.00 Min. :0.00 Min. :0.00 Min. :0.00
1st Qu.: 7.80 1st Qu.: 0.20 1st Qu.: 0.03 1st Qu.:0.05 1st Qu.:0.01 1st Qu.:0.02
Median : 8.50 Median : 0.59 Median : 0.12 Median :0.12 Median :0.04 Median :0.05
Mean : 8.25 Mean : 1.89 Mean : 0.37 Mean :0.28 Mean :0.16 Mean :0.11
3rd Qu.: 9.10 3rd Qu.: 1.80 3rd Qu.: 0.36 3rd Qu.:0.29 3rd Qu.:0.14 3rd Qu.:0.12
Max. :10.00 Max. :82.86 Max. :20.32 Max. :9.76 Max. :9.85 Max. :2.69
NA's :55457 NA's :53965 NA's :36377 NA's :42828 NA's :42603 NA's :48749
other_sales year last_update url status vgchartzscore
Min. :0.00 Min. :1970 Length:55792 Length:55792 Min. :1 Min. :2.60
1st Qu.:0.00 1st Qu.:2000 Class :character Class :character 1st Qu.:1 1st Qu.:6.80
Median :0.01 Median :2008 Mode :character Mode :character Median :1 Median :7.80
Mean :0.04 Mean :2006 Mean :1 Mean :7.43
3rd Qu.:0.04 3rd Qu.:2011 3rd Qu.:1 3rd Qu.:8.50
Max. :3.12 Max. :2020 Max. :1 Max. :9.60
NA's :40270 NA's :979 NA's :54993
img_url
Length:55792
Class :character
Mode :character
sales_clean_2019
view(sales_clean_2016)
view(sales_clean_2019)
boxplot(sales_clean_2019$global_sales)
names(sales_clean_2016)
[1] "name" "platform" "year_of_release" "genre" "publisher"
[6] "na_sales" "eu_sales" "jp_sales" "other_sales" "global_sales"
[11] "critic_score" "critic_count" "user_score" "user_count" "developer"
[16] "rating"
names(sales_clean_2019)
[1] "rank" "name" "basename" "genre" "esrb_rating"
[6] "platform" "publisher" "developer" "vg_chartz_score" "critic_score"
[11] "user_score" "total_shipped" "global_sales" "na_sales" "pal_sales"
[16] "jp_sales" "other_sales" "year" "last_update" "url"
[21] "status" "vgchartzscore" "img_url"
sales_clean_2016 %>%
select(critic_score) %>%
arrange(desc(critic_score))
sales_clean_2019 %>%
select(critic_score) %>%
arrange(desc(critic_score))
sales_clean_2016 %>%
distinct(rating)
sales_clean_2019 %>%
distinct(esrb_rating)
sales_clean_2019 %>%
distinct(vg_chartz_score)
sales_clean_2019 %>%
filter(total_shipped > 0) %>%
count(global_sales)
sales_clean_2019 %>%
filter(total_shipped > 0) %>%
count(na_sales)
sales_clean_2019 %>%
filter(total_shipped > 0) %>%
count(pal_sales)
sales_clean_2019 %>%
filter(total_shipped > 0) %>%
count(jp_sales)
sales_clean_2019 %>%
filter(total_shipped > 0) %>%
count(other_sales)
sales_clean_2019 %>%
filter(global_sales > 0) %>%
select(global_sales, na_sales, pal_sales, jp_sales, other_sales) %>%
summary
global_sales na_sales pal_sales jp_sales other_sales
Min. : 0.0100 Min. :0.00 Min. :0.000 Min. :0.000 Min. :0.000
1st Qu.: 0.0500 1st Qu.:0.05 1st Qu.:0.020 1st Qu.:0.020 1st Qu.:0.000
Median : 0.1400 Median :0.12 Median :0.050 Median :0.050 Median :0.010
Mean : 0.3935 Mean :0.28 Mean :0.169 Mean :0.114 Mean :0.048
3rd Qu.: 0.4000 3rd Qu.:0.29 3rd Qu.:0.160 3rd Qu.:0.130 3rd Qu.:0.040
Max. :20.3200 Max. :9.76 Max. :9.850 Max. :2.690 Max. :3.120
NA's :5258 NA's :5910 NA's :11244 NA's :3525
?sum
global_sales_tot <- sales_clean_2019 %>%
filter(global_sales > 0) %>%
select(global_sales) %>%
sum()
global_sales_tot
[1] 7096.24
na_sales_tot <- sales_clean_2019 %>%
filter(na_sales > 0) %>%
select(na_sales) %>%
sum()
na_sales_tot
[1] 3572.12
pal_sales_tot <- sales_clean_2019 %>%
filter(pal_sales > 0) %>%
select(pal_sales) %>%
sum()
pal_sales_tot
[1] 2047.76
jp_sales_tot <- sales_clean_2019 %>%
filter(jp_sales > 0) %>%
select(jp_sales) %>%
sum()
jp_sales_tot
[1] 777.56
other_sales_tot <- sales_clean_2019 %>%
filter(other_sales > 0) %>%
select(other_sales) %>%
sum()
other_sales_tot
[1] 694.13
global_sales_tot - (jp_sales_tot + na_sales_tot + other_sales_tot + pal_sales_tot)
[1] 4.67
sales_clean_2019 %>%
filter(last_update > 0) %>%
distinct(last_update)
sales_clean_2019 %>%
distinct(status)
sales_clean_2019 %>%
filter(vgchartzscore > 0) %>%
select(critic_score, user_score, vgchartzscore)
sales_clean_2019 %>%
filter(global_sales == 0) %>%
select(na_sales, pal_sales, jp_sales, other_sales)
sales_clean_2019
sales_clean_2019 %>%
distinct(esrb_rating)
sales_clean_2019 <- sales_clean_2019 %>%
mutate("e", esrb_rating = "ka")
sales_miss <- sales_clean_2019 %>%
filter(na_sales == 0) %>%
mutate("missing_sales", (global_sales - na_sales - pal_sales - jp_sales - other_sales)) %>%
sales_miss
Error in sales_miss(.) : could not find function "sales_miss"
Now I am going to confirm which variable heading in 2016 is what in 2019:
2016 vs 2019 rank only in 2019 Name is Name and basename in 2019 looks like a cleaned version of name Platform is platform Year of release is year ut year of release is a character and I can’t work out how to turn it into a numer so I think we should exclude this variable genre is genre publisher is publisher na-sales is na_sales jp_sales is jp_sales eu_sales is separate; pal_sales is Asia, Africa, Europe, South America and Oceania according to google but according to the data dictionary is=t is EU sales Therefore other_sales in 2016 doesn’t match other_sales in 2019 global_sales is global_sales critic_score numbers 84 to 100 in 2016 critic_score numbers 8.6 to 10 in 2019 (so I will divide that figure in 2016 by 10 to compare) critic_count and user_count appear to be the number of critics and users who have been asked to derive the average review score so I am going to pull them out of the data developer is developer I googled the games rating and this is reference to age appropriateness: https://en.wikipedia.org/wiki/Entertainment_Software_Rating_Board E = Everyone M = Mature 17+ T = Teen 13+ E10+ = everyone ten years old and older K-A = Kids to adults - later renamed everyone AO = adults only 18+ EC = early childhood RP = Rating pending
In 2019 this is callied esrb_rating and KA can also be changed to E
Also included in 2019 data is vg_chartz_score which has no variables so that can excluded from the cleaned dataset Total shipped which appears instead of global sales and has no other split
Unfortunately there are missing variables for all geographic areas but the margin of difference is 4.67m so I am going to create a mutate function to highlight which ones have a missing variable and see if they can be added together
Last_update is an interesting piece of information because it indicates an update since the last dataset and also that there is some longevity to the original release. I am inclined to keep this variable even though there are several with no figure. The reason for this is that I have a theory that the ones with updates are likely to be big sellers.
I will exclude the url column because it’s not relevant for this piece of analysis There is inly one status of ‘1’ so I will be removing that. I need to make the year into a number as integer or numeric (will check) It’s not clear why there wasn’t vgchartz variables on the 2016 dataset because it was launched some time before then. https://en.wikipedia.org/wiki/VGChartz
I will exclude the img irl variable.
I am going to make the year in 2016 data a number:
sales_clean_2016 <- sales_clean_2016 %>%
as.numeric("year_of_release")
Error in function_list[[k]](value) :
(list) object cannot be coerced to type 'double'
That doesn’t seem to work. It does now.
summary(sales_clean_2016)
name platform year_of_release genre publisher
Length:16719 Length:16719 Length:16719 Length:16719 Length:16719
Class :character Class :character Class :character Class :character Class :character
Mode :character Mode :character Mode :character Mode :character Mode :character
na_sales eu_sales jp_sales other_sales global_sales
Min. : 0.0000 Min. : 0.000 Min. : 0.0000 Min. : 0.00000 Min. : 0.0100
1st Qu.: 0.0000 1st Qu.: 0.000 1st Qu.: 0.0000 1st Qu.: 0.00000 1st Qu.: 0.0600
Median : 0.0800 Median : 0.020 Median : 0.0000 Median : 0.01000 Median : 0.1700
Mean : 0.2633 Mean : 0.145 Mean : 0.0776 Mean : 0.04733 Mean : 0.5335
3rd Qu.: 0.2400 3rd Qu.: 0.110 3rd Qu.: 0.0400 3rd Qu.: 0.03000 3rd Qu.: 0.4700
Max. :41.3600 Max. :28.960 Max. :10.2200 Max. :10.57000 Max. :82.5300
critic_score critic_count user_score user_count developer
Min. :13.00 Min. : 3.00 Length:16719 Min. : 4.0 Length:16719
1st Qu.:60.00 1st Qu.: 12.00 Class :character 1st Qu.: 10.0 Class :character
Median :71.00 Median : 21.00 Mode :character Median : 24.0 Mode :character
Mean :68.97 Mean : 26.36 Mean : 162.2
3rd Qu.:79.00 3rd Qu.: 36.00 3rd Qu.: 81.0
Max. :98.00 Max. :113.00 Max. :10665.0
NA's :8582 NA's :8582 NA's :9129
rating as.numeric(year_of_release) as.numeric("year_of_release")
Length:16719 Min. :1980 Min. : NA
Class :character 1st Qu.:2003 1st Qu.: NA
Mode :character Median :2007 Median : NA
Mean :2006 Mean :NaN
3rd Qu.:2010 3rd Qu.: NA
Max. :2020 Max. : NA
NA's :269 NA's :16719
Let’s make all variables lower case
sales_clean_2016 <- sales_clean_2016 %>%
mutate_all(.funs = tolower)
head(sales_clean_2016)
#sales_clean_2019 <- sales_clean_2019 %>%
# mutate_all(.funs = tolower)
#head(sales_clean_2019)
I want to fill in missing sales volumes I am going to exclude basename from 2019 results I am going to change critic_score in 2016 to critic_score_2016 and do the same for 2019 I am going to change pal_sales in 2019 to eu_sales_2019 and add the year to
summary(sales_clean_2019)
rank name basename genre esrb_rating
Min. : 1 Length:55792 Length:55792 Length:55792 Length:55792
1st Qu.:13949 Class :character Class :character Class :character Class :character
Median :27896 Mode :character Mode :character Mode :character Mode :character
Mean :27896
3rd Qu.:41844
Max. :55792
platform publisher developer vg_chartz_score critic_score
Length:55792 Length:55792 Length:55792 Mode:logical Min. : 1.00
Class :character Class :character Class :character NA's:55792 1st Qu.: 6.40
Mode :character Mode :character Mode :character Median : 7.50
Mean : 7.21
3rd Qu.: 8.30
Max. :10.00
NA's :49256
user_score total_shipped global_sales na_sales pal_sales jp_sales
Min. : 2.00 Min. : 0.03 Min. : 0.00 Min. :0.00 Min. :0.00 Min. :0.00
1st Qu.: 7.80 1st Qu.: 0.20 1st Qu.: 0.03 1st Qu.:0.05 1st Qu.:0.01 1st Qu.:0.02
Median : 8.50 Median : 0.59 Median : 0.12 Median :0.12 Median :0.04 Median :0.05
Mean : 8.25 Mean : 1.89 Mean : 0.37 Mean :0.28 Mean :0.16 Mean :0.11
3rd Qu.: 9.10 3rd Qu.: 1.80 3rd Qu.: 0.36 3rd Qu.:0.29 3rd Qu.:0.14 3rd Qu.:0.12
Max. :10.00 Max. :82.86 Max. :20.32 Max. :9.76 Max. :9.85 Max. :2.69
NA's :55457 NA's :53965 NA's :36377 NA's :42828 NA's :42603 NA's :48749
other_sales year last_update url status vgchartzscore
Min. :0.00 Min. :1970 Length:55792 Length:55792 Min. :1 Min. :2.60
1st Qu.:0.00 1st Qu.:2000 Class :character Class :character 1st Qu.:1 1st Qu.:6.80
Median :0.01 Median :2008 Mode :character Mode :character Median :1 Median :7.80
Mean :0.04 Mean :2006 Mean :1 Mean :7.43
3rd Qu.:0.04 3rd Qu.:2011 3rd Qu.:1 3rd Qu.:8.50
Max. :3.12 Max. :2020 Max. :1 Max. :9.60
NA's :40270 NA's :979 NA's :54993
img_url
Length:55792
Class :character
Mode :character
sales_clean_2019_sec <- sales_clean_2019 %>%
select(-basename) %>%
select(-vg_chartz_score) %>%
select(-url) %>%
select(-status) %>%
select(-img_url)
summary(sales_clean_2019_sec)
rank name genre esrb_rating platform
Min. : 1 Length:55792 Length:55792 Length:55792 Length:55792
1st Qu.:13949 Class :character Class :character Class :character Class :character
Median :27896 Mode :character Mode :character Mode :character Mode :character
Mean :27896
3rd Qu.:41844
Max. :55792
publisher developer critic_score user_score total_shipped
Length:55792 Length:55792 Min. : 1.00 Min. : 2.00 Min. : 0.03
Class :character Class :character 1st Qu.: 6.40 1st Qu.: 7.80 1st Qu.: 0.20
Mode :character Mode :character Median : 7.50 Median : 8.50 Median : 0.59
Mean : 7.21 Mean : 8.25 Mean : 1.89
3rd Qu.: 8.30 3rd Qu.: 9.10 3rd Qu.: 1.80
Max. :10.00 Max. :10.00 Max. :82.86
NA's :49256 NA's :55457 NA's :53965
global_sales na_sales pal_sales jp_sales other_sales year
Min. : 0.00 Min. :0.00 Min. :0.00 Min. :0.00 Min. :0.00 Min. :1970
1st Qu.: 0.03 1st Qu.:0.05 1st Qu.:0.01 1st Qu.:0.02 1st Qu.:0.00 1st Qu.:2000
Median : 0.12 Median :0.12 Median :0.04 Median :0.05 Median :0.01 Median :2008
Mean : 0.37 Mean :0.28 Mean :0.16 Mean :0.11 Mean :0.04 Mean :2006
3rd Qu.: 0.36 3rd Qu.:0.29 3rd Qu.:0.14 3rd Qu.:0.12 3rd Qu.:0.04 3rd Qu.:2011
Max. :20.32 Max. :9.76 Max. :9.85 Max. :2.69 Max. :3.12 Max. :2020
NA's :36377 NA's :42828 NA's :42603 NA's :48749 NA's :40270 NA's :979
last_update vgchartzscore
Length:55792 Min. :2.60
Class :character 1st Qu.:6.80
Mode :character Median :7.80
Mean :7.43
3rd Qu.:8.50
Max. :9.60
NA's :54993
sales_2019_terr <- sales_clean_2019 %>%
mutate("total_shipped_2019", total_shipped) %>%
mutate("global_sales_2019", global_sales) %>%
mutate("na_sales_2019", na_sales) %>%
mutate("eu_sales_2019", pal_sales) %>%
mutate("jp_sales_2019", jp_sales) %>%
mutate("other_sales_2019", other_sales)
sales_2019_terr
sales_clean_2019
sales_2019_conf <- sales_2019_terr %>%
select(-global_sales) %>%
select(-na_sales) %>%
select(-pal_sales) %>%
select(-jp_sales) %>%
select(-other_sales)
sales_2019_conf
sales_clean_2016
summary(sales_clean_2016)
name platform year_of_release genre publisher
Length:16719 Length:16719 Length:16719 Length:16719 Length:16719
Class :character Class :character Class :character Class :character Class :character
Mode :character Mode :character Mode :character Mode :character Mode :character
na_sales eu_sales jp_sales other_sales global_sales
Min. : 0.0000 Min. : 0.000 Min. : 0.0000 Min. : 0.00000 Min. : 0.0100
1st Qu.: 0.0000 1st Qu.: 0.000 1st Qu.: 0.0000 1st Qu.: 0.00000 1st Qu.: 0.0600
Median : 0.0800 Median : 0.020 Median : 0.0000 Median : 0.01000 Median : 0.1700
Mean : 0.2633 Mean : 0.145 Mean : 0.0776 Mean : 0.04733 Mean : 0.5335
3rd Qu.: 0.2400 3rd Qu.: 0.110 3rd Qu.: 0.0400 3rd Qu.: 0.03000 3rd Qu.: 0.4700
Max. :41.3600 Max. :28.960 Max. :10.2200 Max. :10.57000 Max. :82.5300
critic_score critic_count user_score user_count developer
Min. :13.00 Min. : 3.00 Length:16719 Min. : 4.0 Length:16719
1st Qu.:60.00 1st Qu.: 12.00 Class :character 1st Qu.: 10.0 Class :character
Median :71.00 Median : 21.00 Mode :character Median : 24.0 Mode :character
Mean :68.97 Mean : 26.36 Mean : 162.2
3rd Qu.:79.00 3rd Qu.: 36.00 3rd Qu.: 81.0
Max. :98.00 Max. :113.00 Max. :10665.0
NA's :8582 NA's :8582 NA's :9129
rating
Length:16719
Class :character
Mode :character
sales_2016_conf <- sales_clean_2016 %>%
select(-year_of_release) %>%
select(-critic_count) %>%
select(-user_count) %>%
select(-rating)
sales_2016_conf
sales_clean_2016_sec <- sales_2016_conf %>%
select(-user_score)
Do a full join:
still need to get rid of pal_sales and make it eu_sales
sales_full <- full_join(sales_clean_2019_sec, sales_clean_2016_sec)
Joining, by = c("name", "genre", "platform", "publisher", "developer", "critic_score", "global_sales", "na_sales", "jp_sales", "other_sales")
sales_full
sales_2016_user <- sales_2016_conf %>%
select(-user_score)
sales_2016_user %>%
select(-rating)
sales_2019_conf
view(sales_full)
sales_full %>%
filter(global_sales > 0) %>%
select(total_shipped)
sales_full
mutate_all(sales_full, "global_shipped" = "total_shipped" + "global_sales")
Error in "total_shipped" + "global_sales" :
non-numeric argument to binary operator
mutate(sales_full, global_shipped = (total_shipped + global_sales))
mutate(sales_full, "global_shipped", total_shipped + global_sales)
mutate(sales_full, "global_shipped", (total_shipped + global_sales))
sales_full
here()
[1] "/Users/richardmichaelclark/games_data_sales_project/games_data_sales_project"
view(sales_full)
sales_full_longer <- sales_full %>%
pivot_longer(cols = c("global_sales", "na_sales", "pal_sales", "jp_sales", "other_sales"),
names_to = "sales_geo",
values_to = "sales_m")
write.csv(sales_full_longer, "sales_full_cleaned.csv")